#########
# This script imports and cleans the jobseeker click data
# Author: Jeremias Kläui
########

library(data.table)
library(stringr)
library(readstata13)

# Start from an empty global workspace so objects left over from an earlier
# run cannot leak into this one. NOTE: this removes every object defined in
# the calling session.
rm(list = ls())

# Resolve the relative data paths used below against the folder that holds
# this script. The source-editor context only exists inside RStudio, so when
# the script runs elsewhere (e.g. via Rscript) the current working directory
# is kept instead of aborting. local() keeps the helper variables out of the
# global environment.
local({
  in_rstudio <- requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()
  if (in_rstudio) {
    script_path <- rstudioapi::getSourceEditorContext()$path
    # path is empty for a document that has never been saved
    if (length(script_path) == 1L && nzchar(script_path)) {
      setwd(dirname(script_path))
    }
  }
  invisible(NULL)
})
getwd()

# Load data  --------------------------------------------------------

# OSTE_JR.RData provides inserat_klicks and ostejr
load("data_raw/OSTE_JR.RData")

# STES_JR_200522.RData provides stes_jr, stes_jr_21, inserat_user,
# stes_merk and stes_sp
load("data_raw/STES_JR_200522.RData")

# Turn the loaded tibbles into data.tables.
# Quoted note dated 20-05-2022 (translated from German): "You can ignore
# stes_jr_old and stes_user_Q4_2020; for the data up to 2020, stes_jr is the
# final data frame."
ostejr <- as.data.table(ostejr)
stes_jr <- as.data.table(stes_jr)
stes_jr_21 <- as.data.table(stes_jr_21)
inserat_user <- as.data.table(inserat_user)
inserat_klicks <- as.data.table(inserat_klicks)

# Lower-case the column names of the two inserat tables
setnames(inserat_user, str_to_lower(names(inserat_user)))
setnames(inserat_klicks, str_to_lower(names(inserat_klicks)))

### Mapping table from persnr + month to stes_id
persnr_to_stes_id <- fread("data_processed/persnr_to_stes_id.csv")

# Merge to Stes_id -------------------------------------------------------------------

### Drop the anonymous users
# Tabulate anonymous vs. identified clicks before filtering.
inserat_klicks[, table(user == "anonymousUser")]
# NOTE(review): rows with a missing `user` are dropped here as well, because
# data.table treats NA in a logical `i` as FALSE -- confirm this is intended.
inserat_klicks <- inserat_klicks[user != "anonymousUser"]

### Map each portal user to a person number (persnr)

# 2021 data: inserat_user is linked to stes_jr_21 through cfa_jobr_user
# (inner join, so users without a match are not part of the 2021 mapping).
y21_user_to_persnr <- merge(
  inserat_user,
  stes_jr_21[, .(cfa_jobr_user, persnr)],
  by = "cfa_jobr_user"
)
y21_user_to_persnr <- unique(y21_user_to_persnr[, .(user, persnr)])

# 2020 data: stes_jr carries the pair directly as id / pid; only rows with
# pid > 0 are used.
y20_user_to_persnr <- unique(stes_jr[pid > 0, .(user = id, persnr = pid)])

# Stack both years and keep a single row per user. unique(by = "user") keeps
# the first occurrence, so the 2020 mapping wins when a user is in both.
user_to_persnr <- unique(
  rbind(y20_user_to_persnr, y21_user_to_persnr),
  by = "user"
)

# Left join: every click is kept and persnr is NA where the user is unmapped.
# TRUE is spelled out because `T` is an ordinary variable that can be masked,
# e.g. by an object restored through load().
ad_clicks <- merge(inserat_klicks, user_to_persnr, by = "user", all.x = TRUE)

### Attach the spell id (stes_id) using persnr and the month of the click
# floor_date() lives in lubridate, which this script never attaches; the call
# is namespace-qualified so the script no longer depends on lubridate having
# been loaded earlier in the session.
ad_clicks[, month := as.Date(lubridate::floor_date(zeit, "month"))]
# Left join on (persnr, month): clicks without a matching spell keep
# stes_id = NA.
ad_clicks <- merge(
  ad_clicks,
  persnr_to_stes_id,
  by = c("persnr", "month"),
  all.x = TRUE
)
ad_clicks[, month := NULL] # redundant with "zeit"


# Some descriptives on the merge --------------------------------------------

# Percentage of the non-anonymous clicks for which a persnr was found
ad_clicks[, sum(!is.na(persnr)) / .N * 100] # 89%

# Distinct users, split by whether a persnr and a stes_id could be attached,
# with each cell's percentage of all users:
# 1) 19% of users have no person-number
# 2) 3.3% have a person-number but no spell id (some of them might be
#    jobseekers who clicked outside of their spell)
# 3) The rest is fine
ad_clicks[, uniqueN(user),
          by = .(has_persnr = !is.na(persnr), has_stes_id = !is.na(stes_id))
][, share := V1 / sum(V1) * 100
][, print(.SD)]


# Merge to OSTE-ID (where available) --------------------------------------

# Lookup from the ad hash (ostejr$id) to the AVAM vacancy id, cast to integer.
# NOTE(review): if one ad hash maps to several oste_id_avam values, the join
# below duplicates the affected clicks -- confirm the lookup is unique per hash.
jrhash_to_oste <- unique(
  ostejr[, .(ad_hash = id, oste_id_avam = as.integer(oste_id_avam))]
)

# The click table stores the ad hash in "id"; rename it to the join key.
setnames(ad_clicks, "id", "ad_hash")

# Left join: clicks on ads without an OSTE id keep oste_id_avam = NA.
# TRUE is spelled out because `T` is an ordinary variable that can be masked.
ad_clicks <- merge(ad_clicks, jrhash_to_oste, by = "ad_hash", all.x = TRUE)


# Clean a bit -------------------------------------------------------------

# Store the two categorical columns as factors
ad_clicks[, `:=`(rollen = as.factor(rollen), status = as.factor(status))]


# Export to Stata, with the click timestamp additionally split into a
# clock-time string and a calendar date.
# NOTE(review): time and date are derived from zeit by two different
# functions; confirm both use the same time zone so they agree around midnight.
ad_clicks[, `:=`(time = format(zeit, "%H:%M:%S"), date = as.Date(zeit))]
readstata13::save.dta13(ad_clicks, file = "data_processed/ad_clicks.dta")



